* Find the questions that are most often selected by LASSO across K random subsamples, each X% of the data.
* syntax: select_questions_lasso_stable [y variable] [list of questions as a string] [path of output file] [# subsamples] [subsample size as %] [datapath] [seed] [# cross-validation folds]
program select_questions_lasso_stable
	* Repeatedly draws a random subsample of the data, runs a cross-validated
	* LASSO on it, records which predictors got a non-zero coefficient, and
	* finally hands the per-subsample log to choose_stable for tallying.
	*
	* Positional arguments:
	*   1  outcome variable (also stored in global yvar)
	*   2  space-separated list of candidate predictors (also stored in global variables)
	*   3  path of the final output dataset (passed on to choose_stable)
	*   4  number of subsamples to draw
	*   5  subsample size, in percent (passed to -sample-)
	*   6  path of the input dataset
	*   7  random-number seed, so that runs are reproducible
	*   8  (optional) number of cross-validation folds passed to lassoregress
	*      as numfolds(); when omitted, lassoregress uses its own default
	*
	* Reads global output (directory for the per-subsample log file).
	* Requires the user-written command lassoregress (not part of official
	* Stata; presumably from the elasticregress package - confirm).

	if "`7'" == "" {
		di as error "select_questions_lasso_stable: expected at least 7 arguments:"
		di as error "  yvar questions outfile #subsamples samplesize% datapath seed [#folds]"
		exit 198
	}

	* Strings are copied directly (no "=") so they are never passed through
	* string-expression evaluation.
	global yvar `1'
	local array "`2'"
	local outfile "`3'"
	local subsamples = `4'
	local samplesize = `5'
	local datapath "`6'"
	local seed = `7'

	* Only pass numfolds() when the caller supplied a fold count.
	local foldopt
	if "`8'" != "" {
		local folds = `8'
		local foldopt "numfolds(`folds')"
	}

	set seed `seed'

	* Load once up front so a bad datapath fails before the log file is replaced.
	use "`datapath'", clear
	cap drop  __00*

	* Candidate predictors for the LASSO, normalised to single-space separation.
	local array : list retokenize array
	global variables `array'

	*** main algorithm ***

	local subsample_file "${output}/lasso_subsamples_`samplesize'.txt"

	cap file close out_file // in case a previous run left the handle open
	file open out_file using "`subsample_file'", write replace

	* Everything that writes to the handle runs under capture so that the
	* handle is always closed, even if lassoregress (or anything else) fails.
	capture noisily {
		file write out_file "n" _tab "recodings"  _n

		forval iter = 1 / `subsamples' {

			* Start every draw from the full dataset.
			use "`datapath'", clear
			cap drop  __00*
			sample `samplesize'

			quietly lassoregress $yvar $variables, `foldopt'
			local lassovars `e(varlist_nonzero)'
			local lassovars : list retokenize lassovars

			di "iter:`iter'; lassovars: `lassovars'"

			* Comma-separated list of the selected variables; empty when
			* the LASSO selected nothing in this subsample.
			local recodings : subinstr local lassovars " " ",", all
			file write out_file (`iter') _tab "`recodings'"  _n
		}
	}
	local rc = _rc
	cap file close out_file
	if `rc' {
		exit `rc'
	}

	choose_stable  "`subsample_file'" "`outfile'"

end

* Helper: count how often each variable was picked by the LASSO (read from a tab-delimited text file) and save the counts, sorted from most to least frequently selected.
* syntax: choose_stable [file of lasso output] [path of output dataset]
program choose_stable
	* Tallies how many subsamples selected each variable.
	*
	* Positional arguments:
	*   1  tab-delimited text file with a header row and columns
	*      n (subsample number) and recodings (comma-separated variable names)
	*   2  path of the dataset to save
	*
	* Saved dataset has one row per selected variable with:
	*   recoding        the variable name as written in the input file
	*   question        the second "_"-separated piece of recoding
	*                   (empty when there is no second piece)
	*   times_selected  number of subsamples in which it was selected
	* Rows are sorted by times_selected (descending), ties broken by name.
	* Replaces the data in memory.

	local infile "`1'"
	local outfile "`2'"

	* stringcols(2): keep recodings a string even when every row is empty,
	* otherwise it is imported as numeric and -split- fails.
	import delimited using "`infile'", delimiter(tab) varnames(1) stringcols(2) clear

	quietly count if recodings != ""
	if r(N) == 0 {
		di as error "choose_stable: no variables were selected in any subsample of `infile'"
		exit 2000
	}

	* One row per (subsample, selected variable).
	split recodings, parse(",") gen(q)
	drop recodings

	reshape long q, i(n) j(index)
	keep if q != ""

	collapse (count) n, by(q)
	* Secondary key q makes the order of ties deterministic.
	gsort -n q

	//keep if _n <=30

	rename q recoding
	rename n times_selected

	split recoding, parse("_") gen(q)
	* q2 only exists when at least one name contains an underscore.
	capture confirm variable q2, exact
	if _rc {
		gen question = ""
	}
	else {
		rename q2 question
	}
	keep question recoding times_selected

	save "`outfile'", replace

	* NOTE(review): the disabled block below flags a variable with strpos(),
	* which matches substrings, so a name contained in a longer selected name
	* would also be flagged - fix before re-enabling.
	/* UNCOMMENT TO SAVE DATA ON ALL PULLS
	import delimited using "`infile'", delimiter(tab) clear
	foreach x in $variables {
		gen `x' = 1 if strpos(recodings, "`x'")
		replace `x' = 0 if `x' == .
	}
	drop recodings
	rename n pull

	local all_pulls = subinstr("`outfile'", ".dta", "", .)
	local all_pulls `all_pulls'_allpulls.dta
	save "`all_pulls'", replace
	*/

end
